Look at individually

Pull up Vitals Sign info & RRT events from clinical event table.

In order to answer these questions:

1. Which of the vital signs are actually recorded the most? (e.g. between the different O2 measures)
     We would want to use these measurements -- the ones that are most accessible / most often recorded

2. How many measurements are available from the same encounter before a patient"s RRT event? What"s the time frame we have before RRT events?
    Use this info to help us shape how we timebox measurements for prediction.

In [ ]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
%matplotlib inline
import datetime as datetime
from impala.util import as_pandas
from impala.dbapi import connect

In [ ]:
# connect to impala
conn = connect(host=host="mycluster.domain.com", port=my_impala_port_number)

In [ ]:
# Make sure we"re pulling from the right location
cur = conn.cursor()
cur.execute("use my_db")

In [ ]:
query = """
SELECT \
      ce.clinical_event_id \
    , ce.event_id \
    , ce.encntr_id \
    , ce.person_id \
    , ce.event_cd \
    , cv_event_cd.description AS event_description \
    , ce.performed_dt_tm AS unix_performed_dt_tm \
    , from_unixtime(CAST(ce.performed_dt_tm  / 1000 as bigint)) AS performed_dt_tm \
    , ce.event_tag \
    , ce.result_val \
    , cv_result_units_cd.display AS result_units_display \
    , ce.result_time_units_cd \
    , ce.catalog_cd \
FROM clinical_event ce \
LEFT OUTER JOIN code_value cv_event_cd \
ON ce.event_cd = cv_event_cd.code_value \
LEFT OUTER JOIN code_value cv_result_units_cd \
ON ce.result_units_cd  = cv_result_units_cd.code_value \
WHERE ce.encntr_id IN ( SELECT DISTINCT encntr_id \
                       FROM clinical_event \
                       WHERE event_cd = '54411998' \
                       AND result_status_cd NOT IN ('31', '36') \
                       AND valid_until_dt_tm > unix_timestamp() \
                       AND event_class_cd not in ('654645') \
                       ORDER BY RAND() \
                       LIMIT 10 \
                      ) \
AND ce.event_cd IN ( \
                    '703306',\
                    '703501',\
                    '703511',\
                    '703516',\
                    '703540',\
                    '703558',\
                    '703565',\
                    '703569',\
                    '703960',\
                    '2700541',\
                    '2700653',\
                    '3623994',\
                    '4674677',\
                    '4686698',\
                    '54411998'\
                    )\
ORDER BY ce.encntr_id, ce.performed_dt_tm;
"""

In [ ]:
cur.execute(query)
df = as_pandas(cur)

In [ ]:
df.head()

In [ ]:
df.encntr_id.value_counts()

In [ ]:
# pull off all the encounter ids into an array to loop over
df.encntr_id.value_counts().index.get_values()

In [ ]:
df_enc = df[df.encntr_id=='108285121']
rrt_times = df_enc.unix_performed_dt_tm[df_enc.event_cd == '54411998'].unique()

In [ ]:
rrt_times

In [ ]:
newdf = df_enc.groupby(['event_cd']).count().iloc[:,0].to_frame()

In [ ]:
newdf

In [ ]:
print newdf.sort_values('clinical_event_id', ascending = False)

In [ ]:
# Function to show how many values of different vitals signs for event_cd appeared in the intervals between RRT events 
# for this encounterid.

# Start with a df from clinical_event table, where each line contains either a vital sign reading or an RRT event form
# must include encntr_id, event_cd, unixdatestamp.

def output_vitalsbeforeRRT(df):
    '''
    Function to show how many values of different vitals signs for event_cd appeared in the intervals between RRT events 
for this encounterid Start with a df from clinical_event table, where each line contains either a vital sign reading or 
an RRT event form must include encntr_id, event_cd, unixdatestamp
    '''
    
    encntr_array = df.encntr_id.value_counts().index.get_values()
    
    # loop through all encounter ids
    for encounter_id in encntr_array:
        print "Encounter id: {0}".format(encounter_id)
        
        df_enc = df[df.encntr_id==encounter_id]
        
        # find the different unique RRT times
        rrt_times = df_enc.unix_performed_dt_tm[df_enc.event_cd == '54411998'].unique()    
        print rrt_times
        # go through each interval & print counts for values
        time_prev=0
        
        for time_now in rrt_times:
            df_enc_sub = df_enc[(df_enc.unix_performed_dt_tm < time_now) & (df_enc.unix_performed_dt_tm > time_prev)]
            print "time_now: {0}; time_prev: {1}".format(time_now, time_prev)
            print df_enc_sub.groupby(['event_cd']).count().iloc[:,0].to_frame().sort_values('clinical_event_id', ascending = False)
            print "------------------"
            time_prev = time_now
            
        print "================================"
        
    return "Done"

In [ ]:
output_vitalsbeforeRRT(df)

Time series of vitals


In [ ]:
import pandas as pd

In [ ]:
query_vitals = """
SELECT \
      ce.clinical_event_id \
    , ce.event_id \
    , ce.encntr_id 
    , ce.person_id \
    , ce.event_cd \ 
    , cv_event_cd.description AS event_description \
    , ce.performed_dt_tm AS unix_performed_dt_tm \
    , from_unixtime(CAST(ce.performed_dt_tm / 1000 as bigint)) AS performed_dt_tm \
    , ce.event_tag \ 
    , ce.result_val \
    , cv_result_units_cd.display AS result_units_display \
    , ce.result_time_units_cd \
    , ce.catalog_cd \
FROM clinical_event ce \
LEFT OUTER JOIN code_value cv_event_cd \
ON ce.event_cd = cv_event_cd.code_value \
LEFT OUTER JOIN code_value cv_result_units_cd \ 
ON ce.result_units_cd = cv_result_units_cd.code_value \
WHERE ce.encntr_id ='105479870' \
AND ce.event_cd IN ( \
                    '703306',\
                    '703501',\
                    '703511',\
                    '703516',\
                    '703540',\
                    '703558',\
                    '703565',\
                    '703569',\
                    '2700541',\
                    '2700653',\
                    '3623994',\
                    '4674677',\
                    '4686698',\
                    '4690633',\
                    '54411998',\
                    '54408578'\
                    )\
ORDER BY ce.encntr_id, ce.performed_dt_tm;
"""
cur.execute(query_vitals)
df = as_pandas(cur)

In [ ]:
df["timestamp"] = df.performed_dt_tm.apply(lambda x: pd.Timestamp(x))

In [ ]:
# Exploring different codes below
# df[df.event_cd=='54411998']
# df[df.event_cd=='54408578']
# df[df.event_cd=='54408578']['event_tag'].get_values()[0]
print "RRT reason for call: {0}".format(df[df.event_cd=='54408578']['event_tag'].get_values()[0])

In [ ]:
df['result_val'] = pd.to_numeric(df.result_val, errors = 'coerce')

In [ ]:
df.dtypes

In [ ]:
df_new = pd.pivot_table(df, values="result_val", index='timestamp', columns = 'event_description')

In [ ]:
df_new.tail()

In [ ]:
# Grab unique RRT event times:
RRTEventTimes = df.unix_performed_dt_tm[df.event_cd == "54411998"].unique()
RRT_times = [pd.to_datetime(x, unit='ms') for x in RRTEventTimes]

In [ ]:
df_new.columns.get_values()

In [ ]:
test = df_new.columns.get_values()

In [ ]:
colnames = [item for item in test if item not in ["RRT Event Form", "RRT Primary Reason for Call"]]

In [ ]:
colnames

In [ ]:
# pull dates of encounter so we can set xlim up the right way.
query_minmaxtimes = "SELECT arrive_dt_tm, depart_dt_tm FROM encounter WHERE encntr_id = '105479870';"
cur.execute(query_minmaxtimes)
arr_dep = cur.fetchall()
arr = arr_dep[0][0]
dep = arr_dep[0][1]

In [ ]:
arr = arr_dep[0][0]

In [ ]:
dep = arr_dep[0][1]

In [ ]:
print arr; print dep

In [ ]:
arr = pd.to_datetime(arr, unit="ms")
dep = pd.to_datetime(dep, unit='ms')

In [ ]:
# Here is an example of a random patient's vitals over time 

for name in colnames:
    plt.figure(figsize=(14,6))
    mask = np.isfinite(df_new[name])
    plt.plot(df_new[mask].index.to_pydatetime(), df_new[name][mask], '-o')
    plt.title(name)
    plt.tick_params(labelsize=14)
    plt.xlim([arr.to_pydatetime(), dep.to_pydatetime()])

    # add vertical lines when RRT Events took place
    for val in RRT_times:
        plt.axvline(x=val.to_pydatetime(), linewidth = 2)

Pull the code together in a function. Select encounter randomly!


In [ ]:
def timeseries_randomRRTpatient():
    '''
    Query for & plot the vital signs of a patient with an RRT event.
    Currently written to return a random encounter.
    '''
    
    query_encid = """SELECT DISTINCT encntr_id \
                     FROM clinical_event \
                     WHERE event_cd = '54411998' \
                     AND result_status_cd NOT IN ('31', '36') \
                     AND valid_until_dt_tm > unix_timestamp() \
                     AND event_class_cd not in ('654645');"""
    
    # Note - have note checked that encounter has valid end date...
    # And haven't separated by location.
    cur.execute(query_encid)
    RRT_enc_ids = cur.fetchall()
    RRTencids = [enc[0] for enc in RRT_enc_ids]
    enc_id = np.random.choice(RRTencids)
    
    
    query_vitals = """
    SELECT \
          ce.clinical_event_id \
        , ce.event_id \
        , ce.encntr_id \
        , ce.person_id \
        , ce.event_cd \
        , cv_event_cd.description AS event_description \
        , ce.performed_dt_tm AS unix_performed_dt_tm \
        , from_unixtime(CAST(ce.performed_dt_tm  / 1000 as bigint)) AS performed_dt_tm \
        , ce.event_tag \
        , ce.result_val \
        , cv_result_units_cd.display AS result_units_display \
        , ce.result_time_units_cd \
        , ce.catalog_cd \
    FROM clinical_event ce \
    LEFT OUTER JOIN code_value cv_event_cd \
    ON ce.event_cd = cv_event_cd.code_value \
    LEFT OUTER JOIN code_value cv_result_units_cd \ 
    ON ce.result_units_cd = cv_result_units_cd.code_value \
    WHERE ce.encntr_id = '{0}' \
    AND ce.event_cd IN ( \
                        '703306',\
                        '703501',\
                        '703511',\
                        '703516',\
                        '703540',\
                        '703558',\
                        '703565',\
                        '703569',\
                        '2700541',\
                        '2700653',\
                        '3623994',\
                        '4674677',\
                        '4686698',\
                        '4690633',\
                        '54411998',\
                        '54408578'\
                        )\
    ORDER BY ce.encntr_id, ce.performed_dt_tm;""".format(enc_id)

    cur.execute(query_vitals)
    df = as_pandas(cur)

    df["timestamp"] = df.performed_dt_tm.apply(lambda x: pd.Timestamp(x))
    df['result_val'] = pd.to_numeric(df.result_val, errors = 'coerce')

    # pivot the data so it's in the format we need for plotting
    df_new = pd.pivot_table(df, values="result_val", index='timestamp', columns = 'event_description')

    # Grab unique RRT event times & convert to right format
    RRTEventTimes = df.unix_performed_dt_tm[df.event_cd == "54411998"].unique()
    RRT_times = [pd.to_datetime(x, unit='ms') for x in RRTEventTimes]

    # get columns to query
    init_colnames = df_new.columns.get_values()
    colnames = [item for item in init_colnames if item not in ["RRT Event Form", "RRT Primary Reason for Call"]]

    # print encounter id
    print "encntr_id: {0}".format(df.encntr_id[0])
    print "RRT reason for call: {0}".format(df[df.event_cd=='54408578']['event_tag'].get_values()[0])

    # pull up & print reason for visit
    query_reason = "SELECT reason_for_visit FROM encounter WHERE encntr_id = '{0}';".format(df.encntr_id[0])
    cur.execute(query_reason)
    reason = cur.fetchall()
    reason = reason[0][0]
    print "Reason for encounter: {0}".format(reason)
    
    # pull dates of encounter so we can set xlim up the right way.
    query_minmaxtimes = "SELECT arrive_dt_tm, depart_dt_tm FROM encounter WHERE encntr_id = '{0}';".format(df.encntr_id[0])
    cur.execute(query_minmaxtimes)
    arr_dep = cur.fetchall()
    arr = pd.to_datetime(arr_dep[0][0], unit="ms")
    dep = pd.to_datetime(arr_dep[0][1], unit="ms")

    # plotting: PLOT ALL THE THINGS
    for name in colnames:
        plt.figure(figsize=(14,8))
        mask = np.isfinite(df_new[name])
        plt.plot(df_new[mask].index.to_pydatetime(), df_new[name][mask], '-o')
        plt.title(name)
        plt.tick_params(labelsize=14)
        plt.xlim([arr.to_pydatetime(), dep.to_pydatetime()])

        # add vertical lines when RRT Events took place
        for val in RRT_times:
            plt.axvline(x=val.to_pydatetime(), linewidth = 2)

In [ ]:
timeseries_randomRRTpatient()